{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import datasets,preprocessing\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import r2_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.导入数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_breast_cancer\n",
    "# 导入乳腺癌数据集\n",
    "breast_cancer = load_breast_cancer()\n",
    "# 定义数据、标签\n",
    "X = breast_cancer.data\n",
    "y = breast_cancer.target"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.1 数据划分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# 将数据集划分为训练集和测试集\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.2 数据变换操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# 使用标准化对数据进行预处理\n",
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 特征选择 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "\n",
    "# 训练逻辑回归模型\n",
    "lr = LogisticRegression()\n",
    "lr.fit(X_train_scaled, y_train)\n",
    "\n",
    "# 使用逻辑回归模型的系数进行特征选择\n",
    "sfm = SelectFromModel(lr, prefit=True)\n",
    "X_train_selected = sfm.transform(X_train_scaled)\n",
    "X_test_selected = sfm.transform(X_test_scaled)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. 模型训练"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1使用 l1 正则化项，更换为支持 l1 正则化的求解器，例如 liblinear。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(penalty='l1', solver='liblinear')"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# 训练逻辑回归模型，使用 liblinear 求解器\n",
    "lr = LogisticRegression(penalty='l1', solver='liblinear')\n",
    "lr.fit(X_train_selected, y_train)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. 模型评估"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.9736842105263158\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "# 预测测试集标签\n",
    "y_pred = lr.predict(X_test_selected)\n",
    "\n",
    "# 计算准确率\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(f\"Accuracy: {accuracy}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. 交叉验证及超参数调优"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cross-validation scores: [1.         0.97802198 1.         0.97802198 0.95604396]\n",
      "Best parameters: {'C': 1, 'penalty': 'l1'}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import cross_val_score, GridSearchCV\n",
    "\n",
    "# 使用5折交叉验证评估逻辑回归模型\n",
    "scores = cross_val_score(lr, X_train_selected, y_train, cv=5)\n",
    "print(f\"Cross-validation scores: {scores}\")\n",
    "\n",
    "# 定义超参数网格\n",
    "param_grid = {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']}\n",
    "\n",
    "# 使用网格搜索进行超参数调优\n",
    "grid_search = GridSearchCV(lr, param_grid, cv=5)\n",
    "grid_search.fit(X_train_selected, y_train)\n",
    "\n",
    "# 获取最佳超参数\n",
    "best_params = grid_search.best_params_\n",
    "print(f\"Best parameters: {best_params}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将其中的 penalty='l1' 删除或替换为 penalty='l2'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters: {'C': 10, 'penalty': 'l2'}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# 定义超参数网格\n",
    "param_grid = {'C': [0.1, 1, 10], 'penalty': ['l2']}  # 删除 'l1'\n",
    "\n",
    "# 使用网格搜索进行超参数调优\n",
    "grid_search = GridSearchCV(lr, param_grid, cv=5)\n",
    "grid_search.fit(X_train_selected, y_train)\n",
    "\n",
    "# 获取最佳超参数\n",
    "best_params = grid_search.best_params_\n",
    "print(f\"Best parameters: {best_params}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
